# coding=utf8
import requests
from libs import faqs_coll
from bs4 import BeautifulSoup
import re


# Matches any opening or closing HTML tag, e.g. <p>, </div>, <br/>.
_TAG_PATTERN = re.compile(r'</?\w+[^>]*>')


def decode_html(html):
    """Replace every HTML tag in *html* with a newline and return the text.

    Consecutive tags therefore become consecutive newlines, which the
    caller exploits by splitting on "\\n\\n" to recover paragraphs.
    """
    return _TAG_PATTERN.sub("\n", html)


def get_faq_list():
    """Crawl the dxy.com FAQ index and store each disease's FAQs in MongoDB.

    For every ``<td>`` on the index page that contains a disease link,
    follows the link, parses the FAQ page with :func:`get_faqs`, and
    inserts a ``{'Name': ..., 'Faqs': ...}`` document into ``faqs_coll``.
    """
    base = "http://dxy.com"
    url = base + "/faq/"
    print(url)

    soup = BeautifulSoup(requests.get(url).text, 'lxml')

    for disease in soup.find_all('td'):
        disease_a = disease.find('a')
        # Layout-only cells have no <a>; find() then returns None and the
        # original code crashed with AttributeError. Also skip anchors
        # without an href, which would raise KeyError below.
        if disease_a is None or not disease_a.get('href'):
            continue
        name = disease_a.get_text()
        print(name)
        faqs = get_faqs(base + disease_a['href'])
        faqs_coll.insert_one({
            'Name': name,
            'Faqs': faqs
        })


def get_faqs(url):
    """Fetch one disease's FAQ page and parse it into question/answer dicts.

    The page body (``class="editor-style"``) is flattened to text via
    decode_html() and split into paragraphs. Paragraphs starting with a
    number marker like ``1.`` open a new FAQ entry; subsequent paragraphs
    are appended to the current entry's answer. Parsing stops at trailer
    sections (转载 / 参考文献 / 编辑 / 责编).

    :param url: absolute URL of the disease FAQ page
    :return: list of ``{'F': question, 'A': answer}`` dicts (possibly empty)
    """
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    content = soup.find(class_='editor-style')
    if content is None:
        # Page without the expected container: the original code silently
        # parsed the string "None"; return an empty result explicitly.
        return []

    question_marker = re.compile(r'[1-9]\d*[.]')
    faq_list = []
    for p in decode_html(str(content)).split("\n\n"):
        p = p.strip()
        if not p:
            continue
        # Trailer sections (reprint notice, references, editor credits)
        # mark the end of the FAQ content.
        if "转载" in p or "参考文献" in p or "编辑" in p or "责编" in p:
            break
        if question_marker.match(p):
            # count=1 removes only the leading marker; the original sub()
            # also stripped number-dot sequences INSIDE the question text.
            faq_list.append({
                'F': question_marker.sub('', p, count=1).strip(),
                'A': ''
            })
        elif faq_list:
            # Paragraph before the first question marker is dropped,
            # matching the original behavior (n == -1 case).
            last = faq_list[-1]
            last['A'] = p if last['A'] == '' else last['A'] + "\n" + p
    return faq_list


# Script entry point: crawl the FAQ index and persist every disease's FAQs.
if __name__ == '__main__':
    get_faq_list()
